{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 特征0：0：青年，1：中年，2：老年\n",
    "# 特征1：0：无工作，1：有工作\n",
    "# 特征2：0：无房产，1：有房产\n",
    "# 特征3：0：信贷一般，1：信贷好，2：信贷非常好\n",
    "X = np.array([[0,0,0,0],\n",
    "              [0,0,0,1],\n",
    "              [0,1,0,1],\n",
    "              [0,1,1,0],\n",
    "              [0,0,0,0],\n",
    "              [1,0,0,0],\n",
    "              [1,0,0,1],\n",
    "              [1,1,1,1],\n",
    "              [1,0,1,2],\n",
    "              [1,0,1,2],\n",
    "              [2,0,1,2],\n",
    "              [2,0,1,1],\n",
    "              [2,1,0,1],\n",
    "              [2,1,0,2],\n",
    "              [2,0,0,0]])\n",
    "# 标签：0：不贷款，1：贷款\n",
    "y = np.array([0,0,1,1,0,0,0,1,1,1,1,1,1,1,0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 特征和标签的可取值范围：\n",
    "def H(y):\n",
    "    sum = 0\n",
    "    # 计算y可取到的值\n",
    "    k = set(y)\n",
    "    for ck in k:\n",
    "        Pk = y[y==ck].shape[0] / y.shape[0]\n",
    "        if Pk != 0:\n",
    "            sum -= Pk * np.log2(Pk)\n",
    "    return sum\n",
    "\n",
    "def svm(X, y, feature):\n",
    "    # 计算X的每个特征可取到的值\n",
    "    a = set(X[:,feature])\n",
    "    # 计算数据集的经验熵\n",
    "    HD = H(y)\n",
    "    # 计算特征A对数据集D的经验条件熵H(D|A)\n",
    "    HDA = 0\n",
    "    for value in a:\n",
    "        yDi = y[X[:,feature]==value]\n",
    "        HDA += yDi.shape[0]/y.shape[0] * H(yDi)\n",
    "    return HD - HDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.08300749985576883"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "svm(X, y, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.4199730940219749"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "svm(X, y, 2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def multi(y):\n",
    "    ySet = set(y)\n",
    "    bestCount = 0\n",
    "    for yi in ySet:\n",
    "        count = y.count(yi)\n",
    "        if count > bestCount:\n",
    "            bestCount = count\n",
    "            bestyi = yi\n",
    "    return bestyi\n",
    "\n",
    "def ID3(X, y, epsilon):\n",
    "    # 若D中所有实例属于同一类\n",
    "    if len(set(y))==1:\n",
    "        # 将类$$C_k$$作为该结点的类标记\n",
    "        return y[0]\n",
    "    # 若$$A=\\emptyset$$\n",
    "    if X.shape[1] == 0:\n",
    "        # 实例数最大的类$$C_k$$作为该结点的类标记\n",
    "        return multi(y)\n",
    "    bestInfo = 0\n",
    "    # 计算A中各个特征对D的信息增益\n",
    "    for feature in range(X.shape[1]):\n",
    "        info = svm(X, y, feature)\n",
    "        # 选择信息特征最大的Ag\n",
    "        if svm(X, y, feature) > bestInfo:\n",
    "            bestInfo = info\n",
    "            bestfeature = feature\n",
    "    # 如果Ag的信息增益小于阈值$$\\epsilon$$\n",
    "    if bestInfo < epsilon:\n",
    "        # 将D中实例数最大的类$$C_k$$作为该结点的类标记\n",
    "        return multi(y)\n",
    "    feature = bestfeature\n",
    "    ret = {'feature':feature}\n",
    "    # 对Ag的每一个可能的值ai\n",
    "    a = set(X[:, feature])\n",
    "    for ai in a:\n",
    "        yai = y[X[:,feature] == ai]\n",
    "        Xai = X[X[:,feature] == ai]\n",
    "        ret[ai] = ID3(Xai, yai, epsilon)\n",
    "    return ret"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'feature': 2, 0: {'feature': 1, 0: 0, 1: 1}, 1: 1}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ID3(X, y, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'feature': 2,\n",
       " 'parent': None,\n",
       " 'child': {0: {'feature': 1, 'parent': {...}, 'child': {0: 0, 1: 1}}, 1: 1}}"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "C45(None, X, y, 0)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
